_ : https://www.definitions-marketing.com/definition/segmentation-rfm/
_ : https://www.kaggle.com/code/kautumn06/yellowbrick-clustering-evaluation-examples/notebook
_ : https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html
_ : https://towardsdatascience.com/an-rfm-customer-segmentation-with-python-cf7be647733d
_ : https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17
- : https://machinelearningmastery.com/calculate-feature-importance-with-python/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import warnings
# Silence pandas/seaborn FutureWarnings that clutter notebook output
warnings.simplefilter(action='ignore', category=FutureWarning)
# Pandas display options — canonical option name is "display.max_columns";
# the original "display.max_column" only resolved via prefix matching.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", None)
import matplotlib.style
plt.style.use('fast')
# Load the cleaned, merged Olist dataset (one row per order item / payment record)
data = pd.read_csv('clean_dataset.csv')
# List of categorical features.
# Deduplicated: the original list repeated 'order_id' (4x), 'customer_id',
# 'product_id', 'seller_id' and 'product_category_name' — artifacts of the
# table merge; duplicates add nothing when converting dtypes.
columns_to_convert_to_categorical = [
'customer_id',
'customer_unique_id',
'customer_city',
'customer_state',
'order_id',
'order_status',
'product_id',
'seller_id',
'payment_type',
'review_id',
'product_category_name',
]
# List of columns to convert to datetime
columns_to_convert_to_datetime = [
'order_purchase_timestamp',
'order_approved_at',
'order_delivered_carrier_date',
'order_delivered_customer_date',
'order_estimated_delivery_date',
'shipping_limit_date',
'review_creation_date',
'review_answer_timestamp']
# Parse datetime columns. pd.to_datetime is the supported conversion path;
# .astype('datetime64[ns]') on string columns is deprecated in recent pandas.
for column_name in columns_to_convert_to_datetime:
    data[column_name] = pd.to_datetime(data[column_name])
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119143 entries, 0 to 119142 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 119143 non-null int64 1 order_id 119143 non-null object 2 customer_id 119143 non-null object 3 order_status 119143 non-null object 4 order_purchase_timestamp 119143 non-null datetime64[ns] 5 order_approved_at 118966 non-null datetime64[ns] 6 order_delivered_carrier_date 117057 non-null datetime64[ns] 7 order_delivered_customer_date 115722 non-null datetime64[ns] 8 order_estimated_delivery_date 119143 non-null datetime64[ns] 9 review_id 118146 non-null object 10 review_score 118146 non-null float64 11 review_creation_date 118146 non-null datetime64[ns] 12 review_answer_timestamp 118146 non-null datetime64[ns] 13 payment_sequential 119140 non-null float64 14 payment_type 119137 non-null object 15 payment_installments 119140 non-null float64 16 payment_value 119140 non-null float64 17 customer_unique_id 119143 non-null object 18 customer_city 119143 non-null object 19 customer_state 119143 non-null object 20 geolocation_zip_code_prefix 118820 non-null float64 21 customer_lat 118820 non-null float64 22 customer_lng 118820 non-null float64 23 geolocation_city 118820 non-null object 24 geolocation_state 118820 non-null object 25 order_item_id 118310 non-null float64 26 product_id 118310 non-null object 27 seller_id 118310 non-null object 28 shipping_limit_date 118310 non-null datetime64[ns] 29 price 118310 non-null float64 30 freight_value 117920 non-null float64 31 product_name_lenght 116601 non-null float64 32 product_description_lenght 116601 non-null float64 33 product_photos_qty 116601 non-null float64 34 product_weight_g 118290 non-null float64 35 product_length_cm 118290 non-null float64 36 product_height_cm 118290 non-null float64 37 product_width_cm 118290 non-null float64 38 product_category_name 116576 non-null object 39 volume 118290 non-null float64 40 delivery_time 115707 
non-null float64 41 estimated_delivery_time 118966 non-null float64 42 approval_time 118966 non-null float64 43 purchase_year 119143 non-null int64 44 purchase_month 119143 non-null int64 dtypes: datetime64[ns](8), float64(21), int64(3), object(13) memory usage: 40.9+ MB
# Drop the CSV round-trip artifact index column
data = data.drop(columns=['Unnamed: 0'])
# Share of missing values per column, in percent, ascending
# (.isna().mean() is the per-column NaN ratio, i.e. sum()/len())
df_missing = data.isna().mean().sort_values() * 100
df_missing
order_id 0.000000 purchase_year 0.000000 customer_state 0.000000 customer_city 0.000000 customer_unique_id 0.000000 order_estimated_delivery_date 0.000000 purchase_month 0.000000 customer_id 0.000000 order_purchase_timestamp 0.000000 order_status 0.000000 payment_value 0.002518 payment_sequential 0.002518 payment_installments 0.002518 payment_type 0.005036 approval_time 0.148561 estimated_delivery_time 0.148561 order_approved_at 0.148561 geolocation_city 0.271103 customer_lat 0.271103 customer_lng 0.271103 geolocation_state 0.271103 geolocation_zip_code_prefix 0.271103 shipping_limit_date 0.699160 price 0.699160 seller_id 0.699160 product_id 0.699160 order_item_id 0.699160 volume 0.715946 product_width_cm 0.715946 product_height_cm 0.715946 product_weight_g 0.715946 product_length_cm 0.715946 review_creation_date 0.836810 review_answer_timestamp 0.836810 review_score 0.836810 review_id 0.836810 freight_value 1.026498 order_delivered_carrier_date 1.750837 product_photos_qty 2.133571 product_name_lenght 2.133571 product_description_lenght 2.133571 product_category_name 2.154554 order_delivered_customer_date 2.871339 delivery_time 2.883929 dtype: float64
# Preview the first rows after dtype conversion
data.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | review_id | review_score | review_creation_date | review_answer_timestamp | payment_sequential | payment_type | payment_installments | payment_value | customer_unique_id | customer_city | customer_state | geolocation_zip_code_prefix | customer_lat | customer_lng | geolocation_city | geolocation_state | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | product_category_name | volume | delivery_time | estimated_delivery_time | approval_time | purchase_year | purchase_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | a54f0611adc9ed256b57ede6b6eb5114 | 4.0 | 2017-10-11 | 2017-10-12 03:43:48 | 1.0 | credit_card | 1.0 | 18.12 | 7c396fd4830fd04220f754e42b4e5bff | sao paulo | SP | 3149.0 | -23.574809 | -46.587471 | sao paulo | SP | 1.0 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 3.400864 | 2.165619 | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | housewares | 1976.0 | 8.429144 | 15.536632 | 0.007431 | 2017 | 10 |
| 1 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | a54f0611adc9ed256b57ede6b6eb5114 | 4.0 | 2017-10-11 | 2017-10-12 03:43:48 | 3.0 | voucher | 1.0 | 2.00 | 7c396fd4830fd04220f754e42b4e5bff | sao paulo | SP | 3149.0 | -23.574809 | -46.587471 | sao paulo | SP | 1.0 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 3.400864 | 2.165619 | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | housewares | 1976.0 | 8.429144 | 15.536632 | 0.007431 | 2017 | 10 |
| 2 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | a54f0611adc9ed256b57ede6b6eb5114 | 4.0 | 2017-10-11 | 2017-10-12 03:43:48 | 2.0 | voucher | 1.0 | 18.59 | 7c396fd4830fd04220f754e42b4e5bff | sao paulo | SP | 3149.0 | -23.574809 | -46.587471 | sao paulo | SP | 1.0 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 3.400864 | 2.165619 | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | housewares | 1976.0 | 8.429144 | 15.536632 | 0.007431 | 2017 | 10 |
| 3 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | 8d5266042046a06655c8db133d120ba5 | 4.0 | 2018-08-08 | 2018-08-08 18:37:50 | 1.0 | boleto | 1.0 | 141.46 | af07308b275d755c9edb36a90c618231 | barreiras | BA | 47813.0 | -12.169860 | -44.988369 | barreiras | BA | 1.0 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-07-30 03:24:27 | 4.776599 | 3.125005 | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | perfumery | 4693.0 | 12.502292 | 17.858021 | 1.279745 | 2018 | 7 |
| 4 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | e73b67b67587f7644d5bd1a52deb1b01 | 5.0 | 2018-08-18 | 2018-08-22 19:07:58 | 1.0 | credit_card | 3.0 | 179.12 | 3a653a41f6f9fc3d2a113cf8398680e8 | vianopolis | GO | 75265.0 | -16.746337 | -48.514624 | vianopolis | GO | 1.0 | aa4383b373c6aca5d8797843e5594415 | 4869f7a5dfa277a7dca6462dcf3b52b2 | 2018-08-13 08:55:23 | 5.074549 | 2.955951 | 46.0 | 232.0 | 1.0 | 420.0 | 24.0 | 19.0 | 21.0 | auto | 9576.0 | 9.382708 | 26.628206 | 0.011505 | 2018 | 8 |
#data.loc[:, ['customer_unique_id', "order_id"]]
# Inspect order ids — repeated values show one row per item/payment of an order
data['order_id']
0 e481f51cbdc54678b7cc49136f2d6af7
1 e481f51cbdc54678b7cc49136f2d6af7
2 e481f51cbdc54678b7cc49136f2d6af7
3 53cdb2fc8bc7dce0b6741e2150273451
4 47770eb9100c2d0c44946d9cf07ec65d
...
119138 63943bddc261676b46f01ca7ac2f7bd8
119139 83c1379a015df1e13d02aae0204711ab
119140 11c177c8e97725db2631073c19f07b62
119141 11c177c8e97725db2631073c19f07b62
119142 66dea50a8b16d9b4dee7af250b4be1a5
Name: order_id, Length: 119143, dtype: object
# Inspect the deduplicated customer identifier (stable across orders)
data["customer_unique_id"]
0 7c396fd4830fd04220f754e42b4e5bff
1 7c396fd4830fd04220f754e42b4e5bff
2 7c396fd4830fd04220f754e42b4e5bff
3 af07308b275d755c9edb36a90c618231
4 3a653a41f6f9fc3d2a113cf8398680e8
...
119138 da62f9e57a76d978d02ab5362c509660
119139 737520a9aad80b3fbbdad19b66b37b30
119140 5097a5312c8b157bb7be58ae360ef43c
119141 5097a5312c8b157bb7be58ae360ef43c
119142 60350aa974b26ff12caad89e55993bd6
Name: customer_unique_id, Length: 119143, dtype: object
# Payment amount per record — later summed per customer into the Monetary score
data['payment_value']
0 18.12
1 2.00
2 18.59
3 141.46
4 179.12
...
119138 195.00
119139 271.01
119140 441.16
119141 441.16
119142 86.86
Name: payment_value, Length: 119143, dtype: float64
# Index order rows by customer so the groupby aggregates per unique customer
data.set_index('customer_unique_id', inplace=True)
# Reference date: the most recent purchase observed in the dataset
now = data['order_purchase_timestamp'].max()
# R = days since last purchase, F = number of order rows, M = total spend
rfm = data.groupby(['customer_unique_id']).agg({
    'order_purchase_timestamp': lambda ts: (now - ts.max()).days,
    'order_id': 'count',
    'payment_value': 'sum'})
rfm.rename(columns={'order_purchase_timestamp': 'RecencyScore',
                    'order_id': 'FrequencyScore',
                    'payment_value': 'MonetaryScore'}, inplace=True)
# Log-transform F and M; non-positive values are masked to NaN beforehand
# (.where(cond) yields NaN where cond is False — same effect as the original
# filtered re-assignment)
for score in ('FrequencyScore', 'MonetaryScore'):
    rfm[score] = np.log(rfm[score].where(rfm[score] > 0))
rfm['MonetaryScore']
customer_unique_id
0000366f3b9a7992bf8c76cfdf3221e2 4.955123
0000b849f77a49e4a4ce2b2a4ca5be3f 3.302849
0000f46a3911fa3c0805444483337064 4.456902
0000f6ccb0745a6a4b88665a16c9f078 3.775516
0004aac84e0df4da2b147fca70cf8255 5.282645
...
fffcf5a5ff07b0908bd4e2dbc735a684 8.327204
fffea47cd6d3cc0a88bd621562a9d061 4.437698
ffff371b4d645b6ecea244b27531430a 4.722598
ffff5962728ec6157033ef9805bacc48 4.895524
ffffd2657e2aad2907e67c3e9daecbeb 4.270536
Name: MonetaryScore, Length: 96096, dtype: float64
Récupération de l'identifiant, de la date et de la valeur de chaque commande. Après jointure sur l'identifiant client, on peut calculer les variables RFM.
# Distribution of each RFM score, bin count from Sturges' rule.
# sns.distplot is deprecated (removed in seaborn 0.14) — replaced by
# histplot with a KDE overlay and density normalisation, its documented successor.
n_bins = int(1 + np.log2(len(rfm)))
fig = plt.figure(1, figsize=(15, 4))
plt.subplot(1, 3, 1)
sns.histplot(rfm.RecencyScore, label='Recency Score',
             bins=n_bins, stat='density', kde=True)
plt.ylabel("%")
plt.subplot(1, 3, 2)
sns.histplot(rfm.FrequencyScore, label='Frequency Score',
             bins=n_bins, stat='density', kde=True)
plt.ylabel("%")
plt.subplot(1, 3, 3)
sns.histplot(rfm.MonetaryScore, label='Monetary Score',
             bins=n_bins, stat='density', kde=True)
plt.ylabel("%")
plt.style.use('ggplot')
plt.tight_layout()
plt.show()
# Boxplots to spot outliers in each RFM score.
# seaborn >= 0.12 deprecates passing the data positionally — use the x= keyword.
fig = plt.figure(1, figsize=(18, 4))
plt.subplot(131)
sns.boxplot(x=rfm["RecencyScore"])
plt.subplot(132)
sns.boxplot(x=rfm["FrequencyScore"])
plt.subplot(133)
sns.boxplot(x=rfm["MonetaryScore"])
plt.show()
# Summary statistics of the three scores
rfm.describe()
| RecencyScore | FrequencyScore | MonetaryScore | |
|---|---|---|---|
| count | 96096.000000 | 96096.000000 | 96093.000000 |
| mean | 287.735691 | 0.131395 | 4.805777 |
| std | 153.414676 | 0.339076 | 0.918514 |
| min | 0.000000 | 0.000000 | 2.260721 |
| 25% | 163.000000 | 0.000000 | 4.158883 |
| 50% | 268.000000 | 0.000000 | 4.730392 |
| 75% | 397.000000 | 0.000000 | 5.315322 |
| max | 772.000000 | 4.317488 | 11.601967 |
# First rows of the per-customer RFM table
rfm.head(5)
| RecencyScore | FrequencyScore | MonetaryScore | |
|---|---|---|---|
| customer_unique_id | |||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 0.0 | 4.955123 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 0.0 | 3.302849 |
| 0000f46a3911fa3c0805444483337064 | 585 | 0.0 | 4.456902 |
| 0000f6ccb0745a6a4b88665a16c9f078 | 369 | 0.0 | 3.775516 |
| 0004aac84e0df4da2b147fca70cf8255 | 336 | 0.0 | 5.282645 |
# Dtypes and non-null counts of the RFM table
rfm.info()
<class 'pandas.core.frame.DataFrame'> Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RecencyScore 96096 non-null int64 1 FrequencyScore 96096 non-null float64 2 MonetaryScore 96093 non-null float64 dtypes: float64(2), int64(1) memory usage: 4.9+ MB
Le dataset est agrégé par client unique. Les variables catégorielles ne peuvent pas être agrégées, on va les supprimer.
def _to_days(delta):
    """Convert a timedelta Series to fractional days (86400 seconds per day)."""
    return delta.dt.total_seconds() / 86400

# shipping_time: days between carrier hand-over and the shipping deadline
data['shipping_time'] = _to_days(data['shipping_limit_date']
                                 - data['order_delivered_carrier_date'])
# review_delay: days between purchase and review creation
data['review_delay'] = _to_days(data['review_creation_date']
                                - data['order_purchase_timestamp'])
# review_answer_delay: days between review creation and the seller's answer
data['review_answer_delay'] = _to_days(data['review_answer_timestamp']
                                       - data['review_creation_date'])
# Features to aggregate — order must match the agg dict below, since
# `model.columns = features` is assigned later from this list.
features = ['review_creation_date',
            'review_answer_timestamp',
            'estimated_delivery_time',
            'delivery_time',
            'approval_time',
            'purchase_year',
            'purchase_month',
            'order_purchase_timestamp',
            'payment_sequential',
            'freight_value',
            'price',
            'payment_value',
            'order_item_id',
            'product_weight_g',
            'product_photos_qty',
            'product_name_lenght',
            'volume',
            'shipping_limit_date',
            'order_delivered_carrier_date',
            'shipping_time',
            'review_delay',
            'review_answer_delay'
            ]
# Per-customer aggregation: dates and durations are averaged; monetary
# amounts and item counts are summed.
model = data.groupby('customer_unique_id').agg({'review_creation_date': "mean",
                                                'review_answer_timestamp': "mean",
                                                'estimated_delivery_time': "mean",
                                                'delivery_time': 'mean',
                                                'approval_time': 'mean',
                                                'purchase_year': 'mean',
                                                'purchase_month': 'mean',
                                                'order_purchase_timestamp': 'mean',
                                                'payment_sequential': "sum",
                                                'freight_value': "sum",
                                                'price': 'sum',
                                                'payment_value': "sum",
                                                'order_item_id': 'sum',
                                                'product_weight_g': "mean",
                                                'product_photos_qty': 'mean',
                                                'product_name_lenght': 'mean',
                                                'volume': "mean",
                                                'shipping_limit_date': "mean",
                                                'order_delivered_carrier_date': "mean",
                                                'shipping_time': "mean",
                                                'review_delay': "mean",
                                                'review_answer_delay': "mean",
                                                })
model.info()
<class 'pandas.core.frame.DataFrame'> Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 review_creation_date 95380 non-null datetime64[ns] 1 review_answer_timestamp 95380 non-null datetime64[ns] 2 estimated_delivery_time 95997 non-null float64 3 delivery_time 93343 non-null float64 4 approval_time 95997 non-null float64 5 purchase_year 96096 non-null float64 6 purchase_month 96096 non-null float64 7 order_purchase_timestamp 96096 non-null datetime64[ns] 8 payment_sequential 96096 non-null float64 9 freight_value 96096 non-null float64 10 price 96096 non-null float64 11 payment_value 96096 non-null float64 12 order_item_id 96096 non-null float64 13 product_weight_g 95407 non-null float64 14 product_photos_qty 94108 non-null float64 15 product_name_lenght 94108 non-null float64 16 volume 95407 non-null float64 17 shipping_limit_date 95420 non-null datetime64[ns] 18 order_delivered_carrier_date 94466 non-null datetime64[ns] 19 shipping_time 94465 non-null float64 20 review_delay 95380 non-null float64 21 review_answer_delay 95380 non-null float64 dtypes: datetime64[ns](5), float64(17) memory usage: 16.9+ MB
# Rename aggregated columns from the `features` list.
# NOTE(review): the agg output already carries these names in this order
# (see model.info() above), so this is a no-op safeguard — it silently
# relabels columns if `features` ever drifts out of sync with the agg dict.
model.columns = features
#model = model.join(rfm[['RecencyScore','FrequencyScore','MonetaryScore']])
model.head()
| review_creation_date | review_answer_timestamp | estimated_delivery_time | delivery_time | approval_time | purchase_year | purchase_month | order_purchase_timestamp | payment_sequential | freight_value | price | payment_value | order_item_id | product_weight_g | product_photos_qty | product_name_lenght | volume | shipping_limit_date | order_delivered_carrier_date | shipping_time | review_delay | review_answer_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| customer_unique_id | ||||||||||||||||||||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 2018-05-17 | 2018-05-21 13:44:48 | 10.533819 | 6.400914 | 0.010313 | 2018.0 | 5.0 | 2018-05-10 10:56:27 | 1.0 | 2.484907 | 4.866765 | 141.90 | 1.0 | 1500.0 | 1.0 | 60.0 | 7616.0 | 2018-05-15 11:11:18 | 2018-05-12 08:18:00 | 3.120347 | 6.544132 | 4.572778 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 2018-05-11 | 2018-05-11 22:29:25 | 7.232130 | 2.984005 | 0.301586 | 2018.0 | 5.0 | 2018-05-07 11:11:27 | 1.0 | 2.115050 | 2.939162 | 27.19 | 1.0 | 375.0 | 1.0 | 56.0 | 5148.0 | 2018-05-11 17:56:33 | 2018-05-09 12:18:00 | 2.235104 | 3.533715 | 0.937095 |
| 0000f46a3911fa3c0805444483337064 | 2017-04-06 | 2017-04-07 01:19:44 | 27.121493 | 25.731759 | 0.000000 | 2017.0 | 3.0 | 2017-03-10 21:05:03 | 1.0 | 2.846071 | 4.234107 | 86.22 | 1.0 | 1500.0 | 3.0 | 49.0 | 43750.0 | 2017-03-15 21:05:03 | 2017-03-13 12:58:30 | 2.337882 | 26.121493 | 1.055370 |
| 0000f6ccb0745a6a4b88665a16c9f078 | 2017-11-02 | 2017-11-03 00:17:51 | 31.132442 | 20.023472 | 0.013611 | 2017.0 | 10.0 | 2017-10-12 20:29:41 | 1.0 | 2.869602 | 3.257712 | 43.62 | 1.0 | 150.0 | 5.0 | 43.0 | 1045.0 | 2017-10-18 21:49:17 | 2017-10-13 20:08:19 | 5.070116 | 20.146053 | 1.012396 |
| 0004aac84e0df4da2b147fca70cf8255 | 2017-11-28 | 2017-12-02 22:29:35 | 20.161898 | 13.126435 | 0.014699 | 2017.0 | 11.0 | 2017-11-14 19:45:42 | 1.0 | 2.826722 | 5.192957 | 196.89 | 1.0 | 6050.0 | 3.0 | 58.0 | 528.0 | 2017-11-22 20:06:52 | 2017-11-16 19:52:10 | 6.010208 | 13.176597 | 4.937211 |
# Structure of the per-customer aggregated model
model.info()
<class 'pandas.core.frame.DataFrame'> Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 review_creation_date 95380 non-null datetime64[ns] 1 review_answer_timestamp 95380 non-null datetime64[ns] 2 estimated_delivery_time 95997 non-null float64 3 delivery_time 93343 non-null float64 4 approval_time 95997 non-null float64 5 purchase_year 96096 non-null float64 6 purchase_month 96096 non-null float64 7 order_purchase_timestamp 96096 non-null datetime64[ns] 8 payment_sequential 96096 non-null float64 9 freight_value 96096 non-null float64 10 price 96096 non-null float64 11 payment_value 96096 non-null float64 12 order_item_id 96096 non-null float64 13 product_weight_g 95407 non-null float64 14 product_photos_qty 94108 non-null float64 15 product_name_lenght 94108 non-null float64 16 volume 95407 non-null float64 17 shipping_limit_date 95420 non-null datetime64[ns] 18 order_delivered_carrier_date 94466 non-null datetime64[ns] 19 shipping_time 94465 non-null float64 20 review_delay 95380 non-null float64 21 review_answer_delay 95380 non-null float64 dtypes: datetime64[ns](5), float64(17) memory usage: 16.9+ MB
# Percentage of NaN for each column of the aggregated per-customer model.
# Bug fix: the denominator must be len(model) (96 096 customers), not
# len(data) (119 143 order rows) — dividing model's NaN counts by len(data)
# understated every percentage.
df_missing = model.isna().sum().sort_values()/len(model)*100
df_missing
price 0.000000 purchase_year 0.000000 purchase_month 0.000000 order_purchase_timestamp 0.000000 payment_sequential 0.000000 freight_value 0.000000 payment_value 0.000000 order_item_id 0.000000 estimated_delivery_time 0.083093 approval_time 0.083093 shipping_limit_date 0.567385 volume 0.578297 product_weight_g 0.578297 review_creation_date 0.600959 review_answer_timestamp 0.600959 review_delay 0.600959 review_answer_delay 0.600959 order_delivered_carrier_date 1.368104 shipping_time 1.368943 product_photos_qty 1.668583 product_name_lenght 1.668583 delivery_time 2.310669 dtype: float64
# Join the RFM scores onto the aggregated per-customer features.
# NOTE(review): `sample` is reassigned to rfm.sample(frac=1) a few cells
# below, so this joined table is discarded — confirm which one is intended.
sample = model.join(rfm[['RecencyScore','FrequencyScore','MonetaryScore']])
sample
| review_creation_date | review_answer_timestamp | estimated_delivery_time | delivery_time | approval_time | purchase_year | purchase_month | order_purchase_timestamp | payment_sequential | freight_value | price | payment_value | order_item_id | product_weight_g | product_photos_qty | product_name_lenght | volume | shipping_limit_date | order_delivered_carrier_date | shipping_time | review_delay | review_answer_delay | RecencyScore | FrequencyScore | MonetaryScore | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| customer_unique_id | |||||||||||||||||||||||||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 2018-05-17 | 2018-05-21 13:44:48 | 10.533819 | 6.400914 | 0.010313 | 2018.0 | 5.0 | 2018-05-10 10:56:27 | 1.0 | 2.484907 | 4.866765 | 141.90 | 1.0 | 1500.0 | 1.0 | 60.0 | 7616.0 | 2018-05-15 11:11:18 | 2018-05-12 08:18:00 | 3.120347 | 6.544132 | 4.572778 | 160 | 0.000000 | 4.955123 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 2018-05-11 | 2018-05-11 22:29:25 | 7.232130 | 2.984005 | 0.301586 | 2018.0 | 5.0 | 2018-05-07 11:11:27 | 1.0 | 2.115050 | 2.939162 | 27.19 | 1.0 | 375.0 | 1.0 | 56.0 | 5148.0 | 2018-05-11 17:56:33 | 2018-05-09 12:18:00 | 2.235104 | 3.533715 | 0.937095 | 163 | 0.000000 | 3.302849 |
| 0000f46a3911fa3c0805444483337064 | 2017-04-06 | 2017-04-07 01:19:44 | 27.121493 | 25.731759 | 0.000000 | 2017.0 | 3.0 | 2017-03-10 21:05:03 | 1.0 | 2.846071 | 4.234107 | 86.22 | 1.0 | 1500.0 | 3.0 | 49.0 | 43750.0 | 2017-03-15 21:05:03 | 2017-03-13 12:58:30 | 2.337882 | 26.121493 | 1.055370 | 585 | 0.000000 | 4.456902 |
| 0000f6ccb0745a6a4b88665a16c9f078 | 2017-11-02 | 2017-11-03 00:17:51 | 31.132442 | 20.023472 | 0.013611 | 2017.0 | 10.0 | 2017-10-12 20:29:41 | 1.0 | 2.869602 | 3.257712 | 43.62 | 1.0 | 150.0 | 5.0 | 43.0 | 1045.0 | 2017-10-18 21:49:17 | 2017-10-13 20:08:19 | 5.070116 | 20.146053 | 1.012396 | 369 | 0.000000 | 3.775516 |
| 0004aac84e0df4da2b147fca70cf8255 | 2017-11-28 | 2017-12-02 22:29:35 | 20.161898 | 13.126435 | 0.014699 | 2017.0 | 11.0 | 2017-11-14 19:45:42 | 1.0 | 2.826722 | 5.192957 | 196.89 | 1.0 | 6050.0 | 3.0 | 58.0 | 528.0 | 2017-11-22 20:06:52 | 2017-11-16 19:52:10 | 6.010208 | 13.176597 | 4.937211 | 336 | 0.000000 | 5.282645 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| fffcf5a5ff07b0908bd4e2dbc735a684 | 2017-07-07 | 2017-07-11 11:20:05 | 54.114398 | 27.504676 | 0.010185 | 2017.0 | 6.0 | 2017-06-08 21:00:36 | 2.0 | 11.031093 | 13.313314 | 4134.84 | 3.0 | 30000.0 | 1.0 | 54.0 | 251625.0 | 2017-06-25 21:15:16 | 2017-06-23 13:03:27 | 2.341539 | 28.124583 | 4.472280 | 495 | 0.693147 | 8.327204 |
| fffea47cd6d3cc0a88bd621562a9d061 | 2018-01-10 | 2018-01-15 11:51:59 | 32.155324 | 30.091667 | 0.005833 | 2017.0 | 12.0 | 2017-12-10 20:07:56 | 1.0 | 2.980111 | 4.172694 | 84.58 | 1.0 | 1720.0 | 2.0 | 37.0 | 32076.0 | 2017-12-14 20:16:20 | 2017-12-21 16:27:01 | -6.840752 | 30.161157 | 5.494433 | 310 | 0.000000 | 4.437698 |
| ffff371b4d645b6ecea244b27531430a | 2017-02-23 | 2017-02-25 15:03:21 | 44.331435 | 14.862731 | 0.009352 | 2017.0 | 2.0 | 2017-02-07 15:49:16 | 1.0 | 3.116178 | 4.498698 | 112.46 | 1.0 | 600.0 | 5.0 | 57.0 | 3315.0 | 2017-02-11 15:49:16 | 2017-02-08 10:04:37 | 3.239340 | 15.340787 | 2.627326 | 617 | 0.000000 | 4.722598 |
| ffff5962728ec6157033ef9805bacc48 | 2018-05-15 | 2018-05-17 15:16:20 | 25.334178 | 11.830313 | 0.028542 | 2018.0 | 5.0 | 2018-05-02 15:17:41 | 1.0 | 2.927989 | 4.744932 | 133.69 | 1.0 | 444.0 | 3.0 | 35.0 | 4480.0 | 2018-05-08 15:58:47 | 2018-05-03 16:45:00 | 4.967905 | 12.362720 | 2.636343 | 168 | 0.000000 | 4.895524 |
| ffffd2657e2aad2907e67c3e9daecbeb | 2017-05-11 | 2017-05-13 10:55:24 | 22.364433 | 6.762905 | 0.789213 | 2017.0 | 5.0 | 2017-05-02 20:18:45 | 1.0 | 2.678965 | 4.042876 | 71.56 | 1.0 | 400.0 | 2.0 | 45.0 | 2178.0 | 2017-05-10 15:15:13 | 2017-05-04 12:05:22 | 6.131840 | 8.153646 | 2.455139 | 532 | 0.000000 | 4.270536 |
96096 rows × 25 columns
# Shuffle the full RFM table (frac=1 keeps 100% of the rows, only reordered).
# NOTE(review): the original comment said "5% selection of a sample", which
# contradicts frac=1 — use frac=0.05 if a true 5% subsample is intended.
sample = rfm.sample(frac=1)
sample.head(50)
| RecencyScore | FrequencyScore | MonetaryScore | |
|---|---|---|---|
| customer_unique_id | |||
| cfe59c8aa4d4bd4344db92d818d04c1f | 69 | 0.000000 | 3.837946 |
| 6b6c763cc4d02614d74b0c372127fe13 | 517 | 0.000000 | 4.546481 |
| e6d08c276c45a22381ec076deeb63b13 | 257 | 0.000000 | 4.625561 |
| 0f1190ffa87d71271b42b9df8245258f | 626 | 0.000000 | 4.969813 |
| 448e0dea390e9ed20f330b7e75b12ff3 | 371 | 0.000000 | 4.246207 |
| 65c2b54b418a73fcb36a6b6e6765dd23 | 431 | 0.693147 | 5.842384 |
| c06fd806752b504eca5cea51d4bc0c4c | 184 | 1.386294 | 5.449492 |
| 4f94b2a128e75939c61c188212388e3e | 503 | 0.000000 | 7.252493 |
| 10c3a1c4d4262a949a536157ac3c4f6d | 67 | 0.000000 | 5.722146 |
| 4b0f1a28c6fd7af4705d217c728f22de | 74 | 0.000000 | 5.951814 |
| 12728eaaef44ecf87a76083fd1e442c3 | 345 | 0.693147 | 5.248917 |
| f7b223e244004385231a6e4db45a06a9 | 258 | 0.000000 | 4.636087 |
| dfdcc1f8c27500c34a9fc70695b1b1e4 | 138 | 0.000000 | 5.507321 |
| c8ec99d9532ed61578f3717cb0eff87c | 229 | 0.000000 | 3.988984 |
| 4640720dd73950599ff10a468a10d61e | 174 | 0.000000 | 5.562872 |
| 0becec3df826cb50f107c7a99cb44b90 | 637 | 0.000000 | 6.607488 |
| 7857512dc9a14ef806537f0443fd5d9e | 246 | 0.000000 | 3.924544 |
| bd7287fdad74d4d53c8d2b6e1cfd644d | 156 | 0.000000 | 6.527329 |
| 81eb2077986bca5eba40d0796490073a | 199 | 0.000000 | 4.334542 |
| f84a99a71347da8c4956f56a987211ba | 519 | 0.000000 | 4.863913 |
| 642ac36382354363985c6bfdba639c42 | 249 | 0.000000 | 3.625673 |
| 87307bedec2b4496fc1fac01d1d98a20 | 219 | 0.000000 | 4.670302 |
| 2f76c8ac8310a38b82baafddbf090db8 | 77 | 0.000000 | 4.689787 |
| 2542d967894acdc11dbffa8dd2e82a6a | 154 | 1.098612 | 6.505470 |
| 2e6922b1b345f1606b329d24f3d09136 | 381 | 0.693147 | 3.397189 |
| 6861be7c47819049f96ce098fa072e3a | 212 | 0.000000 | 4.374246 |
| 628bf7b073cfff27a6d6ad849c29a1f2 | 245 | 0.000000 | 6.315087 |
| f6fccceee418fcc63939b6bb5e20128d | 155 | 0.000000 | 3.691875 |
| 4ddabde26cd4b8378ccbdc76daf8b745 | 600 | 0.000000 | 4.396176 |
| c861accb34a721194cc29e7a7e5fe5f0 | 309 | 0.000000 | 4.331785 |
| 787a5c4e16118ce0267bb67520cf8179 | 366 | 0.000000 | 4.175925 |
| afa602f01babaaaa3376444634400a25 | 272 | 0.000000 | 4.535820 |
| f79aa45136d8243492e2eaa67f203425 | 171 | 0.000000 | 4.746236 |
| 56f154706913672dc7f0c7350db0dda2 | 165 | 0.000000 | 4.720729 |
| 5a465fa8d582b46073747a7a0dafb516 | 241 | 0.000000 | 3.613617 |
| d9a99b290182aff8db93595d904ab99c | 625 | 0.693147 | 6.070276 |
| b2387b8bb8016bffddb4ff61265a4fd4 | 406 | 0.000000 | 4.666171 |
| a740a724bc243698bdbf93a8dd68a09a | 305 | 0.000000 | 4.945136 |
| edd176efa31cbfc929efbac163cb45a6 | 368 | 0.693147 | 6.724025 |
| 5cee614881d2b4cccc3378dcc933578d | 349 | 0.000000 | 3.593194 |
| fbb8b4ad21213ec7a23c867b6bcdf843 | 599 | 0.000000 | 3.822973 |
| 981665ad276011294ada8eddaaa56d8f | 333 | 0.000000 | 5.506469 |
| 9b3b3eaa63df7cc2909934af6d2034ba | 245 | 0.000000 | 4.667018 |
| 5a4b4cc7bcccb91c0eba9112d7c4c8bd | 253 | 0.000000 | 4.879083 |
| 947c620fd333114daef626378bb14848 | 326 | 0.000000 | 4.780803 |
| 42d8b87cd525a34a306cca6db06d83a1 | 386 | 0.000000 | 5.313895 |
| c843b255e98911eb5a8c0062e6b0894b | 293 | 0.000000 | 5.335854 |
| d07abcc5931dc246719c4cb0c7fd53f0 | 185 | 0.000000 | 3.193763 |
| c326cd6e78ae9b11f8fee1bb7ac9b441 | 126 | 0.000000 | 4.025887 |
| 076ad5b91ba9b470ed7126b6f22fe201 | 171 | 0.000000 | 4.367040 |
# Confirm the shuffle kept all 96 096 rows and the three RFM columns
sample.info()
<class 'pandas.core.frame.DataFrame'> Index: 96096 entries, cfe59c8aa4d4bd4344db92d818d04c1f to 4cc93c73069ea5ba716248771a30fffb Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RecencyScore 96096 non-null int64 1 FrequencyScore 96096 non-null float64 2 MonetaryScore 96093 non-null float64 dtypes: float64(2), int64(1) memory usage: 2.9+ MB
# Imputers
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
# Transformers
from sklearn.decomposition import PCA
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
# Separation into numerical and categorical features.
# The RFM sample is entirely numeric (int64/float64 per sample.info() above),
# so categorical_features comes back empty here.
numerical_features = sample.select_dtypes(include=['int64', 'float64']).columns
categorical_features = sample.select_dtypes(include=['object', 'bool']).columns
# Quick 2-component PCA on the sample and cumulative explained variance.
# Fixes: the original cell had three statements fused onto one line (a
# syntax error) and the typo `explained_varianceratio` — the sklearn
# attribute is `explained_variance_ratio_`.
# NOTE(review): `sample` still contains NaN (MonetaryScore) at this point;
# PCA raises on NaN, so this likely belongs after the imputing pipeline —
# verify the intended input.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(sample)
total_explained_variance = pca.explained_variance_ratio_.cumsum()
# Preprocessing for the numerical data: median imputation, then standardisation.
imputation = ('imputer', SimpleImputer(strategy='median'))
scaling = ('std_scaler', StandardScaler())
numerical_transformer = Pipeline(steps=[imputation, scaling])
# Full preprocessing pipeline — only a numerical branch, since the RFM
# sample carries no categorical columns.
preprocessing_pipeline = Pipeline(
    steps=[('numerical_transformer', numerical_transformer)])
# Keep the customer index around for later re-labelling of the scaled array
index = sample.index
print(index)
Index(['cfe59c8aa4d4bd4344db92d818d04c1f', '6b6c763cc4d02614d74b0c372127fe13',
'e6d08c276c45a22381ec076deeb63b13', '0f1190ffa87d71271b42b9df8245258f',
'448e0dea390e9ed20f330b7e75b12ff3', '65c2b54b418a73fcb36a6b6e6765dd23',
'c06fd806752b504eca5cea51d4bc0c4c', '4f94b2a128e75939c61c188212388e3e',
'10c3a1c4d4262a949a536157ac3c4f6d', '4b0f1a28c6fd7af4705d217c728f22de',
...
'0e8750e4ffa3dc86470cdfa30237f867', 'a9634ba777001268fd432bfdf0f21a10',
'b4a365450bf1efc7d7935fb4d838c3e7', '5270cffd29ac266ad0acc6bf4261321c',
'049978bae2febc352587dab45fb38d3a', '86466e264fde8cca4fd07bec8d83fe96',
'bb5b5f40144184d10b184574c513fcd3', '89baf783e8bf8766ee306ad8aa15ef2a',
'14f5335a43420b92aa77e1f35ada3f42', '4cc93c73069ea5ba716248771a30fffb'],
dtype='object', name='customer_unique_id', length=96096)
# Column labels of the sample (RecencyScore, FrequencyScore, MonetaryScore)
columns = sample.columns
print(columns)
Index(['RecencyScore', 'FrequencyScore', 'MonetaryScore'], dtype='object')
# First column: RecencyScore (days since last purchase)
sample.iloc[:,0]
customer_unique_id
cfe59c8aa4d4bd4344db92d818d04c1f 69
6b6c763cc4d02614d74b0c372127fe13 517
e6d08c276c45a22381ec076deeb63b13 257
0f1190ffa87d71271b42b9df8245258f 626
448e0dea390e9ed20f330b7e75b12ff3 371
...
86466e264fde8cca4fd07bec8d83fe96 390
bb5b5f40144184d10b184574c513fcd3 60
89baf783e8bf8766ee306ad8aa15ef2a 215
14f5335a43420b92aa77e1f35ada3f42 164
4cc93c73069ea5ba716248771a30fffb 70
Name: RecencyScore, Length: 96096, dtype: int64
# Second column: FrequencyScore (log of the order count)
sample.iloc[:,1]
customer_unique_id
cfe59c8aa4d4bd4344db92d818d04c1f 0.0
6b6c763cc4d02614d74b0c372127fe13 0.0
e6d08c276c45a22381ec076deeb63b13 0.0
0f1190ffa87d71271b42b9df8245258f 0.0
448e0dea390e9ed20f330b7e75b12ff3 0.0
...
86466e264fde8cca4fd07bec8d83fe96 0.0
bb5b5f40144184d10b184574c513fcd3 0.0
89baf783e8bf8766ee306ad8aa15ef2a 0.0
14f5335a43420b92aa77e1f35ada3f42 0.0
4cc93c73069ea5ba716248771a30fffb 0.0
Name: FrequencyScore, Length: 96096, dtype: float64
# Third column: MonetaryScore (log of the total spend)
sample.iloc[:,2]
customer_unique_id
cfe59c8aa4d4bd4344db92d818d04c1f 3.837946
6b6c763cc4d02614d74b0c372127fe13 4.546481
e6d08c276c45a22381ec076deeb63b13 4.625561
0f1190ffa87d71271b42b9df8245258f 4.969813
448e0dea390e9ed20f330b7e75b12ff3 4.246207
...
86466e264fde8cca4fd07bec8d83fe96 3.654029
bb5b5f40144184d10b184574c513fcd3 5.506632
89baf783e8bf8766ee306ad8aa15ef2a 3.696103
14f5335a43420b92aa77e1f35ada3f42 7.702204
4cc93c73069ea5ba716248771a30fffb 4.101320
Name: MonetaryScore, Length: 96096, dtype: float64
# Impute and scale the RFM sample; fit_transform returns a plain ndarray,
# so the customer index and column labels are lost here.
sample_scaled = preprocessing_pipeline.fit_transform(sample)
# Quick look at the first two scaled dimensions
fig = plt.figure(figsize=(5, 5))
plt.scatter(sample_scaled[:, 0], sample_scaled[:, 1])
<matplotlib.collections.PathCollection at 0x11d04532400>
sample_scaled.shape
(96096, 3)
#print(sample_scaled)
# Shape of the first scaled feature vector
sample_scaled[:,0].shape
(96096,)
sample_scaled[:,1].shape
(96096,)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import davies_bouldin_score

# Sweep candidate cluster counts and record three model-selection metrics.
# FIX: the Davies-Bouldin dict was named `db`, which is shadowed by the
# DBSCAN model fitted later in the notebook — renamed to `db_scores`.
sse, slc, db_scores = {}, {}, {}
for i in range(2, 7):
    kmeans = KMeans(init='k-means++', n_clusters=i, random_state=10, max_iter=400, n_init=50).fit(sample_scaled)
    clusters = kmeans.labels_
    sse[i] = kmeans.inertia_                                       # elbow criterion (lower = tighter)
    slc[i] = silhouette_score(sample_scaled, clusters)             # higher is better
    db_scores[i] = davies_bouldin_score(sample_scaled, clusters)   # lower is better

# Graphs for each metric
fig, (ax0, ax1, ax2) = plt.subplots(1, 3, figsize=(25, 5))
ax0.plot(list(sse.keys()), list(sse.values()), color="blue", marker='o')
ax0.set_xlabel("number of cluster")
ax0.set_ylabel("inertia")
ax1.plot(list(slc.keys()), list(slc.values()), color="gold", marker='o')
ax1.set_xlabel("number of cluster")
ax1.set_ylabel("silhouette score")
ax2.plot(list(db_scores.keys()), list(db_scores.values()), color="red", marker='o')
ax2.set_xlabel("Number of cluster")
ax2.set_ylabel("Davies-Bouldin score")
Text(0, 0.5, 'Davies-Bouldin score')
from sklearn.manifold import TSNE
import matplotlib.cm as cm

# Fit K-Means at the retained cluster count and visualise the clusters
# in a 2-D t-SNE projection of the scaled RFM features.
n_clusters = 4
sample_scaled = preprocessing_pipeline.fit_transform(sample)
fig, ax2 = plt.subplots(1)
fig.set_size_inches(10, 10)
clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=400, n_init=50)
cluster_labels = clusterer.fit_predict(sample_scaled)
labels_scale = clusterer.labels_
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed clusters
silhouette_avg = silhouette_score(sample_scaled, cluster_labels)
print("for n clusters =", n_clusters, "average silhouette_score is :", silhouette_avg)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(sample_scaled, cluster_labels)
# 2nd Plot showing the actual clusters formed
reducer1 = TSNE(n_components=2)
data_reduced1 = reducer1.fit_transform(sample_scaled)
# FIX: seaborn >= 0.12 requires x/y as keyword arguments
sns.scatterplot(x=data_reduced1[:, 0], y=data_reduced1[:, 1], palette='deep',
                hue=cluster_labels, legend="full", alpha=0.7, ax=ax2)
ax2.set_title("clustered data")
ax2.set_xlabel("feature space for 1st feature")
ax2.set_ylabel("feature space for 2nd feature")
# FIX: the two adjacent string literals concatenated without a separator,
# rendering "clusteringwith" — a space is added at the join point
plt.suptitle(("TSNE for KMeans clustering "
              "with n clusters = %d" % n_clusters),
             fontsize=14, fontweight='bold')
plt.show()
for n clusters = 4 average silhouette_score is : 0.34016719280744995
from sklearn.cluster import KMeans

# Build the preprocessed (scaled) dataframe, keeping the original index/columns
sample_scaled = preprocessing_pipeline.fit_transform(sample)
df_sample_scaled = pd.DataFrame(sample_scaled,
                                index=sample.index,
                                columns=sample.columns)

# Fit K-Means with the retained number of clusters
n_clusters = 4
clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=400, n_init=50)
cluster_labels = clusterer.fit_predict(sample_scaled)
labels_scale = clusterer.labels_

# Attach each client's cluster label to the scaled features
df_cluster_labels = pd.DataFrame({'cluster_labels': labels_scale},
                                 index=df_sample_scaled.index)
sample_clustered = df_sample_scaled.join(df_cluster_labels)

# Share of clients per cluster
(sample_clustered['cluster_labels']
 .value_counts(normalize=True)
 .sort_index()
 .plot.bar(figsize=(9, 4)))
<AxesSubplot:>
# Distribution of clients in clusters
# (same distribution as the bar plot above, expressed in percent)
sample_clustered['cluster_labels'].value_counts().sort_index()/len(sample_clustered)*100
0 33.844281 1 14.060939 2 25.979229 3 26.115551 Name: cluster_labels, dtype: float64
# Cluster sizes: all three aggregations count rows, so the bars coincide
df_test = sample_clustered.groupby(['cluster_labels']).agg(
    RecencyScore=('RecencyScore', 'size'),
    FrequencyScore=('FrequencyScore', 'size'),
    MonetaryScore=('MonetaryScore', 'size'),
)
df_test.sort_index().plot.bar(figsize=(9, 4))
<AxesSubplot:xlabel='cluster_labels'>
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.manifold import TSNE
import matplotlib.cm as cm

# Density-based alternative: DBSCAN on the scaled RFM features,
# visualised with the same t-SNE projection as the K-Means run.
sample_scaled = preprocessing_pipeline.fit_transform(sample)
fig, ax2 = plt.subplots(1)
fig.set_size_inches(10, 10)
# Compute DBSCAN (label -1 marks noise points)
db = DBSCAN(eps=0.05, min_samples=10).fit(sample_scaled)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# 2nd Plot showing the actual clusters formed
reducer1 = TSNE(n_components=2)
data_reduced1 = reducer1.fit_transform(sample_scaled)
# FIX: seaborn >= 0.12 requires x/y as keyword arguments
sns.scatterplot(x=data_reduced1[:, 0], y=data_reduced1[:, 1], palette='deep',
                hue=labels, legend="full", alpha=0.7, ax=ax2)
ax2.set_title("clustered data")
ax2.set_xlabel("feature space for 1st feature")
ax2.set_ylabel("feature space for 2nd feature")
plt.suptitle("TSNE for DBSCAN", fontsize=14, fontweight='bold')
plt.show()
# Preprocessed dataframe used as the feature matrix for the classifier below
sample_imputed = preprocessing_pipeline.fit_transform(sample)
df_sample_imputed = pd.DataFrame(sample_imputed,
                                 index=sample.index,
                                 columns=sample.columns)

# Same preprocessing, kept as a dataframe for the clustering step
sample_scaled = preprocessing_pipeline.fit_transform(sample)
sample_scaled = pd.DataFrame(sample_scaled,
                             index=sample.index,
                             columns=sample.columns)

# Re-fit K-Means and collect the labels as a one-column dataframe
n_clusters = 4
clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=400, n_init=50)
cluster_labels = clusterer.fit_predict(sample_scaled)
labels_scale = clusterer.labels_
df_cluster_labels = pd.DataFrame({'cluster_labels': cluster_labels},
                                 index=sample_scaled.index)
# hierachical clustering for flexibiliy and small ammount of features (<50)
# random forest for feature importance on a classification problem
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot

# Train a random forest to predict the K-Means cluster from the RFM features,
# then read off the feature importances to characterise the clusters.
X, y = df_sample_imputed, df_cluster_labels
# define the model
classeur = RandomForestClassifier()
# FIX: fit on a 1-D label array — passing the one-column DataFrame raised
# a DataConversionWarning ("A column-vector y was passed...")
classeur.fit(X, y.values.ravel())
# get importance
importance = classeur.feature_importances_
df_importance = pd.DataFrame(importance,
                             index=X.columns,
                             )
df_importance = df_importance.rename(columns={0: 'importance'})
df_importance = df_importance.sort_values(by=['importance'], ascending=False)
# plot feature importance
# .nlargest(4) # pandas Series method which will return a subset of the series with the largest n values.
sns.barplot(x='importance',
            y=df_importance.index, data=df_importance)
C:\Users\tropical noise\AppData\Local\Temp\ipykernel_11864\1449054764.py:10: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). classeur.fit(X, y)
<AxesSubplot:xlabel='importance'>
# Distribution of clients in clusters
# (repeated sanity check of the per-cluster share, in percent)
sample_clustered['cluster_labels'].value_counts().sort_index()/len(sample_clustered)*100
0 33.844281 1 14.060939 2 25.979229 3 26.115551 Name: cluster_labels, dtype: float64
# Parallel-coordinates plot to characterise the clusters (one line per client)
import plotly.graph_objects as go

# Colour stops reused for the cluster labels
_colorscale = [[0, 'red'], [0.1, 'blue'], [0.2, 'yellow'], [0.3, 'green'],
               [0.4, 'darkgreen'], [0.5, 'orange'], [0.6, 'purple'],
               [0.7, 'lightblue'], [1, 'gold']]

# One axis per RFM score, plus the cluster-label axis in second position
_dimensions = [
    dict(range=[sample_clustered['RecencyScore'].min(), sample_clustered['RecencyScore'].max()],
         label="RecencyScore", values=sample_clustered['RecencyScore']),
    dict(range=[-1, 6],
         tickvals=[0, 1, 2, 3, 4, 5],
         label='cluster_labels', values=sample_clustered['cluster_labels']),
    dict(range=[sample_clustered['FrequencyScore'].min(), sample_clustered['FrequencyScore'].max()],
         label="FrequencyScore", values=sample_clustered['FrequencyScore']),
    dict(range=[sample_clustered['MonetaryScore'].min(), sample_clustered['MonetaryScore'].max()],
         label="MonetaryScore", values=sample_clustered['MonetaryScore']),
]

fig = go.Figure(data=go.Parcoords(
    line=dict(color=sample_clustered['cluster_labels'], colorscale=_colorscale),
    dimensions=_dimensions))
fig.update_layout(plot_bgcolor='white', paper_bgcolor='white')
fig.show()
# Mean value of each variable, per cluster (one row per cluster)
# FIX: DataFrame.append was removed in pandas 2.0 — build the rows with
# pd.concat instead. Side benefit: the result keeps numeric dtypes,
# whereas appending to an empty frame coerced every column to object.
df_mean = pd.concat(
    [sample_clustered[sample_clustered['cluster_labels'] == i].mean().to_frame().T
     for i in range(4)]  # 4 == number of K-Means clusters used above
)
df_mean.head()
| RecencyScore | FrequencyScore | MonetaryScore | cluster_labels | |
|---|---|---|---|---|
| 0 | -0.58871 | -0.361765 | -0.781065 | 0.0 |
| 0 | -0.045746 | 2.219526 | 1.12246 | 1.0 |
| 0 | -0.46266 | -0.38751 | 0.690989 | 2.0 |
| 0 | 1.24781 | -0.340706 | -0.279512 | 3.0 |
df_mean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4 entries, 0 to 0 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RecencyScore 4 non-null object 1 FrequencyScore 4 non-null object 2 MonetaryScore 4 non-null object 3 cluster_labels 4 non-null object dtypes: object(4) memory usage: 160.0+ bytes
df_mean['MonetaryScore'].min()
-0.7810652642820877
# Parallel-coordinates plot of the per-cluster mean profile (one line per cluster)
import plotly.graph_objects as go

# FIX: every Parcoords dimension must supply as many values as the colouring
# series. The original coloured by the 4-row df_mean but fed MonetaryScore and
# RecencyScore from the 96k-row sample_clustered (and truncated them with
# astype('int64')), producing inconsistent dimension lengths. All ranges and
# values now come from df_mean, untruncated.
fig = go.Figure(data=
    go.Parcoords(
        line=dict(color=df_mean['cluster_labels'],
                  colorscale=[[0, 'red'], [0.1, 'blue'], [0.2, 'yellow'], [0.3, 'green'],
                              [0.4, 'darkgreen'], [0.5, 'orange'], [0.6, 'purple'],
                              [0.7, 'lightblue'], [1, 'gold']]),
        dimensions=list([
            dict(range=[0, 4],
                 tickvals=[0, 1, 2, 3],
                 label='cluster_labels', values=df_mean['cluster_labels']),
            dict(range=[df_mean['MonetaryScore'].min(), df_mean['MonetaryScore'].max()],
                 label="MonetaryScore", values=df_mean['MonetaryScore']),
            dict(range=[df_mean['FrequencyScore'].min(), df_mean['FrequencyScore'].max()],
                 label="FrequencyScore", values=df_mean['FrequencyScore']),
            dict(range=[df_mean['RecencyScore'].min(), df_mean['RecencyScore'].max()],
                 label="RecencyScore", values=df_mean['RecencyScore'])])
    )
)
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white'
)
fig.show()
rfm.head()
| RecencyScore | FrequencyScore | MonetaryScore | |
|---|---|---|---|
| customer_unique_id | |||
| 0000366f3b9a7992bf8c76cfdf3221e2 | 160 | 0.0 | 4.955123 |
| 0000b849f77a49e4a4ce2b2a4ca5be3f | 163 | 0.0 | 3.302849 |
| 0000f46a3911fa3c0805444483337064 | 585 | 0.0 | 4.456902 |
| 0000f6ccb0745a6a4b88665a16c9f078 | 369 | 0.0 | 3.775516 |
| 0004aac84e0df4da2b147fca70cf8255 | 336 | 0.0 | 5.282645 |
rfm.info()
<class 'pandas.core.frame.DataFrame'> Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RecencyScore 96096 non-null int64 1 FrequencyScore 96096 non-null float64 2 MonetaryScore 96093 non-null float64 dtypes: float64(2), int64(1) memory usage: 4.9+ MB
# Scale the full RFM table for the temporal-stability study
sample_scaled = preprocessing_pipeline.fit_transform(rfm)
# transformation into dataframe
df_customer_dataset_reduced = pd.DataFrame(sample_scaled,
                                           index=rfm.index,
                                           columns=rfm.columns)
# Attach each client's purchase period (assumes `model` shares the same
# customer index — TODO confirm upstream)
df_customer_dataset_reduced_time = pd.merge(model[['purchase_year', 'purchase_month']],
                                            df_customer_dataset_reduced,
                                            left_index=True, right_index=True, how='outer')
n_clusters = 4
clusterer = KMeans(n_clusters=n_clusters, random_state=10, max_iter=400, n_init=50)
# FIX: fit_predict was invoked twice in a row on identical input;
# one fit is enough (random_state is fixed, so the labels are the same).
cluster_labels = clusterer.fit_predict(sample_scaled)
labels_scale = clusterer.labels_
df_cluster_labels = pd.DataFrame(cluster_labels, index=df_customer_dataset_reduced.index)
df_cluster_labels.rename(columns={0: "clusters"}, inplace=True)
df_customer_dataset_reduced_time.info()
<class 'pandas.core.frame.DataFrame'> Index: 96096 entries, 0000366f3b9a7992bf8c76cfdf3221e2 to ffffd2657e2aad2907e67c3e9daecbeb Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 purchase_year 96096 non-null float64 1 purchase_month 96096 non-null float64 2 RecencyScore 96096 non-null float64 3 FrequencyScore 96096 non-null float64 4 MonetaryScore 96096 non-null float64 dtypes: float64(5) memory usage: 6.4+ MB
from sklearn.metrics.cluster import adjusted_rand_score

# list of purchase_year and purchase_month
year = (df_customer_dataset_reduced_time.purchase_year.astype(int).unique()).tolist()
month = (df_customer_dataset_reduced_time.purchase_month.astype(int).unique()).tolist()
datasets = []
ari = []
# For each month of each year, extract the corresponding slice of the dataset
# and store it in a list of monthly datasets.
# FIX: the loop variable was named `data`, silently clobbering the global
# `data` DataFrame loaded at the top of the notebook — renamed.
for y in year:
    for m in month:
        monthly_data = df_customer_dataset_reduced_time.loc[
            (df_customer_dataset_reduced_time.purchase_year == y)
            & (df_customer_dataset_reduced_time.purchase_month == m), :]
        datasets.append(monthly_data)
df_merge = []
# Growing-window ARI: re-cluster the cumulated monthly data and compare its
# labels with those of the global clustering over the same clients.
# FIX: the original duplicated the body in a len(df_merge) == 1 branch
# (pd.concat of a single frame is equivalent), and its inplace drop on a
# slice raised SettingWithCopyWarning and mutated the stored monthly frame;
# a single pd.concat + non-inplace drop avoids both.
for k in range(len(datasets)):
    df_merge.append(datasets[k])
    df_final = pd.concat(df_merge)
    if len(df_final) >= 4:  # KMeans needs at least n_clusters samples
        df_final = df_final.drop(columns=['purchase_year', 'purchase_month'])
        clusterer.fit(df_final)
        labels_true = df_cluster_labels.loc[df_final.index, 'clusters'].tolist()
        ari.append(adjusted_rand_score(labels_true, clusterer.labels_))
ari.reverse()
# plot of results
fig = plt.figure(figsize=(16, 4))
plt.plot(range(len(ari)), ari, marker='o')
plt.xlabel('nombre de mois')
plt.ylabel('indice ARI')
C:\Users\tropical noise\AppData\Local\Temp\ipykernel_11864\96580728.py:26: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Text(0, 0.5, 'indice ARI')